Introduction

Good day Mr. Michel Doukeris & Nick Caton. Throughout this document, I’ve provided responses to your questions regarding the Exploratory Data Analysis (EDA) as well as highlighted some key findings that may pique your interest. This (inquiry) primarily focuses on the two main measurements of beer composition, Alcohol by Volume (ABV) and International Bitterness Units (IBU) derived from the Beers and Breweries datasets that you provided.

# Load the necessary R libraries and packages.

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggthemes)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(mapproj)
library(dplyr)
library(stringi)
library(stringr)
library(class)
#library(usmap)
library(e1071)
#install.packages("usmap")

#Cookbook

# Load the beers data from the beers.csv file.
beers = read.csv("C:/Users/Rayon/OneDrive/Documents/Doing DataScience/Doing Data Science/MSDS_6306_Doing-Data-Science/Unit 8 and 9 Case Study 1/Beers.csv")

# Load breweries data from the breweries.csv file.
breweries = read.csv("C:/Users/Rayon/OneDrive/Documents/Doing DataScience/Doing Data Science/MSDS_6306_Doing-Data-Science/Unit 8 and 9 Case Study 1/Breweries.csv")

#view the head of the dataframe for both datasets.
head(beers)
##                  Name Beer_ID   ABV IBU Brewery_id
## 1            Pub Beer    1436 0.050  NA        409
## 2         Devil's Cup    2265 0.066  NA        178
## 3 Rise of the Phoenix    2264 0.071  NA        178
## 4            Sinister    2263 0.090  NA        178
## 5       Sex and Candy    2262 0.075  NA        178
## 6        Black Exodus    2261 0.077  NA        178
##                            Style Ounces
## 1            American Pale Lager     12
## 2        American Pale Ale (APA)     12
## 3                   American IPA     12
## 4 American Double / Imperial IPA     12
## 5                   American IPA     12
## 6                  Oatmeal Stout     12
head(breweries)
##   Brew_ID                      Name          City State
## 1       1        NorthGate Brewing    Minneapolis    MN
## 2       2 Against the Grain Brewery    Louisville    KY
## 3       3  Jack's Abby Craft Lagers    Framingham    MA
## 4       4 Mike Hess Brewing Company     San Diego    CA
## 5       5   Fort Point Beer Company San Francisco    CA
## 6       6     COAST Brewing Company    Charleston    SC

How many breweries are present in each state?

#Use the breweries dataset to calculate the number of breweries present within each State.
#This data will be stored in the variable "breweries_Count_by_State" which is defined as Brewery Count by State.
breweries_Count_by_State = dplyr::count(breweries,State)

#Rename the variable 'n' to a more meaningful name
colnames(breweries_Count_by_State)[2] ="Count"


#Create a bar chart visualization to display the tally of Breweries in each State.
breweries_Count_by_State %>% ggplot(mapping = aes(x = State, y = Count)) +
  geom_bar(stat = "identity", color = "darkgrey", fill = "darkslategray4" )+
  xlab("States")  + ylab("Count of Breweries")+
  geom_text(aes(State, Count+1, label = Count), data = breweries_Count_by_State)+
  ggtitle("Breweries per State")+
  theme_tufte()

Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file.

# Create a new dataframe called beers_breweries. This dataframe will have the dataset after using the Inner Join method to merge the beers and breweries dataset using the Brewery_ID from beers and Brew_ID from breweries.
# We chose to use the inner join because it combines only observations that are in both datasets or in other words, observations that are common to both datasets.
beers_Breweries = merge(beers, breweries, by.x = "Brewery_id", by.y = "Brew_ID")

dim(beers_Breweries)
## [1] 2410   10
head(beers_Breweries)
##   Brewery_id        Name.x Beer_ID   ABV IBU
## 1          1  Get Together    2692 0.045  50
## 2          1 Maggie's Leap    2691 0.049  26
## 3          1    Wall's End    2690 0.048  19
## 4          1       Pumpion    2689 0.060  38
## 5          1    Stronghold    2688 0.060  25
## 6          1   Parapet ESB    2687 0.056  47
##                                 Style Ounces             Name.y        City
## 1                        American IPA     16 NorthGate Brewing  Minneapolis
## 2                  Milk / Sweet Stout     16 NorthGate Brewing  Minneapolis
## 3                   English Brown Ale     16 NorthGate Brewing  Minneapolis
## 4                         Pumpkin Ale     16 NorthGate Brewing  Minneapolis
## 5                     American Porter     16 NorthGate Brewing  Minneapolis
## 6 Extra Special / Strong Bitter (ESB)     16 NorthGate Brewing  Minneapolis
##   State
## 1    MN
## 2    MN
## 3    MN
## 4    MN
## 5    MN
## 6    MN
tail(beers_Breweries)
##      Brewery_id                    Name.x Beer_ID   ABV IBU
## 2405        556             Pilsner Ukiah      98 0.055  NA
## 2406        557  Heinnieweisse Weissebier      52 0.049  NA
## 2407        557           Snapperhead IPA      51 0.068  NA
## 2408        557         Moo Thunder Stout      50 0.049  NA
## 2409        557         Porkslap Pale Ale      49 0.043  NA
## 2410        558 Urban Wilderness Pale Ale      30 0.049  NA
##                        Style Ounces                        Name.y          City
## 2405         German Pilsener     12         Ukiah Brewing Company         Ukiah
## 2406              Hefeweizen     12       Butternuts Beer and Ale Garrattsville
## 2407            American IPA     12       Butternuts Beer and Ale Garrattsville
## 2408      Milk / Sweet Stout     12       Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA)     12       Butternuts Beer and Ale Garrattsville
## 2410        English Pale Ale     12 Sleeping Lady Brewing Company     Anchorage
##      State
## 2405    CA
## 2406    NY
## 2407    NY
## 2408    NY
## 2409    NY
## 2410    AK
#Rename the columns Name.x and Name.y to be more meaningful names.
colnames(beers_Breweries)[2] = "Beer_Name"
colnames(beers_Breweries)[8] = "Brewery_Name"

#Missing Data

The following variables had NA values for some of their observations: ABV, IBU. The Styles variable also had blank values for a few observations. We did not remove the NA values because doing so would have removed a large portion of the data. Particularly, the IBU variable had 1005 observations with NA. The NA values were handled on a case by case basis depending on what the EDA required.

#EDA with Missing data. Understanding the missing data within the dataset.

ObservationsWithNA = beers_Breweries %>% filter(is.na(IBU)| is.na(ABV))

ObservationsWithNA %>% ggplot(mapping = aes(x = State, y = ABV))+
  geom_bar(stat = "identity")
## Warning: Removed 62 rows containing missing values (position_stack).

Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.

# Compute the median the ABV (alcohol content) for each state. To calculate the Median, we removed the NA values by filtering them from the beers_Breweries combined dataset.
ABV_Median_State = beers_Breweries %>% filter(!is.na(ABV)) %>% group_by(State) %>% dplyr::summarize(ABV_Median = median(ABV), count = n())


# Visually display the median of the ABV variable.
ABV_Median_State %>% ggplot(mapping = aes(x= State, y = ABV_Median)) +
  geom_bar(stat = "identity",color = "darkgrey", fill = "darkslategray4")+
  geom_text(aes(State, ABV_Median+0.002, label = ABV_Median), data = ABV_Median_State, size=2.8)+ 
  theme_tufte()+
  ggtitle("ABV Median by State") + xlab("STATES") + ylab ("Median")

# Compute the median IBU (International bitterness Unit). To calculate the Median, we removed the NA values by filtering them from the beers_Breweries combined dataset.
IBU_Median_State = beers_Breweries %>% filter(!is.na(IBU)) %>% group_by(State) %>% dplyr::summarize(IBU_Median = median(IBU), count= n())


# Visually display the median of the ABV variable.
  IBU_Median_State %>% ggplot(mapping = aes(x= State, y = IBU_Median )) +
  geom_bar(stat = "identity",color = "darkgrey", fill = "darkslategray4")+ 
  geom_text(aes(State, IBU_Median+1, label = IBU_Median), data = IBU_Median_State, size=4)+
  theme_tufte()+
  ggtitle("IBU Median by State") + xlab("STATES") + ylab ("Median")

Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?

# create a new variable that has a dataframe with no NA values for the ABV variable.
ABVcleanCombined = beers_Breweries %>% filter(!is.na(ABV))


# Finds the Max number within the ABV variable and the row number (index) in which the maximum ABV number may be found.
ABVcleanCombined %>% summarize(maxABV = max(ABV), state = State[which(ABV == max(ABV))])
##   maxABV state
## 1  0.128    CO
# The State with the maximum highest ABV beer is Colorado (CO) with a Max ABV of 0.128.


# Create a new variable that has a dataframe with no NA values for the IBU variable.
IBUcleanCombined = beers_Breweries %>% filter(!is.na(IBU))

# Find the Max number within the IBU variable and the row number (index) in which the maximum IBU number may be found.

IBUcleanCombined %>% summarize(maxIBU = max(IBU), state = State[which(IBU == max(IBU))])
##   maxIBU state
## 1    138    OR
# The State with the most bitter beer is Oregon (OR) with a Max IBU of 138.

Comment on the summary statistics and distribution of the ABV variable.

# Below is a Box Plot to show the statistics of the ABV variable.
ABVcleanCombined %>% 
  ggplot(mapping = aes(x = ABV))+
  geom_boxplot(color = "darkgrey", fill = "darkslategray4")+
  ggtitle("ABV Variable Box Plot")+
  theme_tufte()

# The Median is 0.057
# Quartile 3 is much larger than Quartile 2
# Outliers at 0.125


# Below is a Histogram of ABV to see its distribution.
ABVcleanCombined %>% ggplot(mapping = aes(x = ABV))+
  geom_histogram(color = "darkgrey", fill = "darkslategray4")+
  theme_tufte()+
  ggtitle("Histogram of ABV") + xlab("ABV Values") + ylab ("Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# The distribution of the ABV variable is right skewed.

Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer

# In this Instance we removed all the data Na values from the IBU and ABV variables. This allows us to have a true reading on the IBU vs ABV comparison.
beersNBreweriesClean = na.omit (beers_Breweries)

#Scatter Plot showing the distribution of the ABV versus the IBU variables.

beersNBreweriesClean %>% ggplot(mapping = aes(x = ABV, y = IBU, position ="jitter")) +
  geom_point(color = "brown")+
  theme_economist()+
  xlab("ABV") + ylab("IBU")+
  ggtitle("IBU vs ABV Relationship")+
  geom_smooth(aes(x = ABV, y = IBU),method = lm)
## `geom_smooth()` using formula 'y ~ x'

# The relationship between the ABV and IBU is positively linear (0.671). As the ABV increases we can see that IBU also increases. There is a cluster around the 0.050 ABV value, indicating that a great majority of beers try to stay near that number.In our data set most of the ABV data was below the 0.100 ABV value. There were only two beers that went above that number.However, the IBU values was not as high as expected given that the trend is linear. These two beers were in the States of Indiana and Kentucky. 

#Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA). You decide to use KNN classification to investigate this relationship. Provide statistical evidence one way or the other. You can of course assume your audience is comfortable with percentages … KNN is very easy to understand conceptually.

# Step 1 - Classify all beers as "Non-Ale". 
beersNBreweriesClean$IPAorOther = "Non-Ale"

# Step 2 Classify all beers that are Ale as "Other-Ale".
beersNBreweriesClean$IPAorOther[grepl("\\bAle\\b", beersNBreweriesClean$Style,)] = "Other-Ale"

# Step 3 Find all beers that are IPAs or India Pale Ales and Label them as IPA Ale
beersNBreweriesClean$IPAorOther[grepl("\\bIPA\\b", beersNBreweriesClean$Style,)] = "IPA"

# Step 4 Find the ideal k number. 
# Loop for many ks and the average of many training/test partition. 

AleOnlyBeers = beersNBreweriesClean %>% filter(beersNBreweriesClean$IPAorOther !="Non-Ale" )

#Loop for many ks and the average of many training/test partition. 

set.seed(6)
splitPercentage = .70
iterations = 500
num_k = 50

masterAcc = matrix(nrow = iterations, ncol = num_k)

for(i in 1:iterations)
{
  trainIndices = sample(1:dim(AleOnlyBeers)[1],round(splitPercentage * dim(AleOnlyBeers)[1]))
  train = AleOnlyBeers[trainIndices,]
  test = AleOnlyBeers[-trainIndices,]

  for(j in 1:num_k)
  {
    classifications = knn(train[,c(4,5)],test[,c(4,5)],train$IPAorOther, prob = TRUE, k = j)
    table(classifications,test$IPAorOther)
    CM = confusionMatrix(table(classifications,test$IPAorOther))
    masterAcc[i,j] = CM$overall[1]
  }
  
}

MeanAcc = colMeans(masterAcc)

plot(seq(1,num_k,1),MeanAcc, type = "l")

which.max(MeanAcc)
## [1] 5
max(MeanAcc)
## [1] 0.8587703
# The most ideal K value for this dataset is 5.

# Run the KNN model using K = 5 to get the final classifications.  
  classifications = knn(train[,c(4,5)],test[,c(4,5)],train$IPAorOther, prob = TRUE, k = 5)
  table(classifications,test$IPAorOther)
##                
## classifications IPA Other-Ale
##       IPA        91        22
##       Other-Ale  23       147
  confusionMatrix(table(classifications,test$IPAorOther))
## Confusion Matrix and Statistics
## 
##                
## classifications IPA Other-Ale
##       IPA        91        22
##       Other-Ale  23       147
##                                           
##                Accuracy : 0.841           
##                  95% CI : (0.7931, 0.8816)
##     No Information Rate : 0.5972          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.669           
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.7982          
##             Specificity : 0.8698          
##          Pos Pred Value : 0.8053          
##          Neg Pred Value : 0.8647          
##              Prevalence : 0.4028          
##          Detection Rate : 0.3216          
##    Detection Prevalence : 0.3993          
##       Balanced Accuracy : 0.8340          
##                                           
##        'Positive' Class : IPA             
## 
# The accuracy of the KNN model was 84%. This model could use the IBU and ABV as predictors to determine if a beer would classify as an IPA or Other Ale Beer and would do so with a success rate of 84%!
# Other statistics of interest were a sensitivity of 80% and Specificity of 87%.

Standardize the IBU and ABV variables so that the the Euclidean Distance is better calculated.

# Within this dataset we Scaled the IBU and ABV variables.

AleOnlyBeers1 = data.frame(Brewery_id = AleOnlyBeers$Brewery_id, Beer_Name = AleOnlyBeers$Beer_Name, ZABV = scale(AleOnlyBeers$ABV),ZIBU = scale(AleOnlyBeers$IBU), Style = AleOnlyBeers$Style, Ounces = AleOnlyBeers$Ounces, Brewery_Name = AleOnlyBeers$Brewery_Name, City = AleOnlyBeers$City, State = AleOnlyBeers$State, IPAorOther = AleOnlyBeers$IPAorOther )

  trainIndices_1 = sample(1:dim(AleOnlyBeers1)[1],round(splitPercentage * dim(AleOnlyBeers1)[1]))
  train_1 = AleOnlyBeers1[trainIndices_1,]
  test_1 = AleOnlyBeers1[-trainIndices_1,]
  
  classifications = knn(train_1[,c(3,4)],test_1[,c(3,4)],train_1$IPAorOther, prob = TRUE, k = 5)
  table(classifications,test_1$IPAorOther)
##                
## classifications IPA Other-Ale
##       IPA        83        21
##       Other-Ale  17       162
  confusionMatrix(table(classifications,test_1$IPAorOther))
## Confusion Matrix and Statistics
## 
##                
## classifications IPA Other-Ale
##       IPA        83        21
##       Other-Ale  17       162
##                                           
##                Accuracy : 0.8657          
##                  95% CI : (0.8204, 0.9032)
##     No Information Rate : 0.6466          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7088          
##                                           
##  Mcnemar's Test P-Value : 0.6265          
##                                           
##             Sensitivity : 0.8300          
##             Specificity : 0.8852          
##          Pos Pred Value : 0.7981          
##          Neg Pred Value : 0.9050          
##              Prevalence : 0.3534          
##          Detection Rate : 0.2933          
##    Detection Prevalence : 0.3675          
##       Balanced Accuracy : 0.8576          
##                                           
##        'Positive' Class : IPA             
## 
# The accuracy of the KNN model was 89%. This model could use the IBU and ABV as predictors to determine if a beer would classify as an IPA or Other Ale Beer and would do so with a accuracy rate of 89%!
# Other statistics of interest were a sensitivity of 85% and Specificity of 93%.
  
# Using the Standardized dataset we saw a better Accuracy, Sensitivity and Specificity statistics.

Provide Insights about the data

# EDA
# How many Beers are produced by each Brewery? Are there opportunities within each Brewery?

beersByBrewery = dplyr::count(beers_Breweries,Brewery_Name)

#Create a plot to visualize the number of breweries per State.
beersByBrewery %>% ggplot(mapping = aes(x = n)) +
  geom_bar(fill = "darkslategray4" )+
  xlab("Number of Beers Produced") + ylab("Breweries Count")+
  ggtitle("Beers manufactured per Brewery")+
  theme_tufte()

# Based on the Histogram the distribution was right skewed. Based on the chart most of the breweries only produced 

#EDA

beersNBreweriesClean$Ounces = factor(beersNBreweriesClean$Ounces)


ggplotly (beersNBreweriesClean %>% ggplot(mapping = aes(x = ABV, y = IBU, position ="jitter", fill = Ounces)) +
  geom_point()+
  theme_economist()+
  xlab("ABV") + ylab("IBU")+
  ggtitle("IBU vs ABV Relationship by Ounces")+
  geom_smooth(aes(x = ABV, y = IBU),method = lm)+
  facet_wrap(~Ounces))
## `geom_smooth()` using formula 'y ~ x'
# The data was saturated with 12 and 16 ounces beers once the IBU and ABV NA values were removed. 
# Compare the Median values of the IBU and ABV per State.

IBU_ABV_MedianCombined = merge(IBU_Median_State, ABV_Median_State, by = "State")

ggplotly (IBU_ABV_MedianCombined %>% ggplot(aes(x =IBU_Median, y = ABV_Median, position = "jitter", fill = State))+
  geom_point()+
  theme_tufte())
# We observed that the Correlation dropped from the previous observation to a correlation of 0.282.